Rerunning featureCounts without -O flag but still same gtf file as before to compare.
#! /bin/bash -l
#SBATCH --partition=angsd_class
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --job-name=align_qc
#SBATCH --time=00:30:00 # HH/MM/SS
#SBATCH --mem=8G
mamba activate angsd
featureCounts -a /athena/angsd/scratch/cnm4001/hg38/hg38.gencodev41.gtf \
-o /athena/angsd/scratch/cnm4001/NPC_counts/NPC_combined_featureCounts.txt \
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849597.Aligned.sortedByCoord.out.bam /athena/angsd/scratch/cnm4001/NPC_bams/SRR11849598.Aligned.sortedByCoord.out.bam
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849599.Aligned.sortedByCoord.out.bam
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849600.Aligned.sortedByCoord.out.bam
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849601.Aligned.sortedByCoord.out.bam
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849602.Aligned.sortedByCoord.out.bam
old_counts_summary <- read.table("/Users/Caden/Desktop/angsd/project/angsd_project/FeatureCounts/old_counts/NPC_combined_featureCounts.txt.summary", header = TRUE)
new_counts_summary <- read.table("/Users/Caden/Desktop/angsd/project/angsd_project/FeatureCounts/old_counts/reRun_NPC_combined_featureCounts.txt.summary", header = TRUE)
colnames(old_counts_summary) <- c("Status", seq(1:6))
colnames(new_counts_summary) <- c("Status", seq(1:6))
#show only non-zero rows
old_counts_summary[c(1,9,12,14), ]
## Status 1 2 3 4 5
## 1 Assigned 23249396 23504788 23677886 23703714 23886091
## 9 Unassigned_MultiMapping 20259038 18100320 18091774 17992087 17293837
## 12 Unassigned_NoFeatures 947714 1134073 784544 984225 938354
## 14 Unassigned_Ambiguity 0 0 0 0 0
## 6
## 1 23567663
## 9 18836084
## 12 876303
## 14 0
new_counts_summary[c(1,9,12,14), ]
## Status 1 2 3 4 5
## 1 Assigned 4235852 4747633 3232305 4742332 5132972
## 9 Unassigned_MultiMapping 20259038 18100320 18091774 17992087 17293837
## 12 Unassigned_NoFeatures 947714 1134073 784544 984225 938354
## 14 Unassigned_Ambiguity 19013544 18757155 20445581 18961382 18753119
## 6
## 1 3646303
## 9 18836084
## 12 876303
## 14 19921360
Seems like featureCounts also has lots of ambiguous reads. I would guess its the gtf issue.
#! /bin/bash -l
#SBATCH --partition=angsd_class
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --job-name=download
#SBATCH --time=02:00:00 # HH/MM/SS
#SBATCH --mem=8G
cd /athena/angsd/scratch/cnm4001/hg38
wget 'https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_43/gencode.v43.annotation.gtf.gz' -O hg38.gencodev43.gtf.gz
#! /bin/bash -l
#SBATCH --partition=angsd_class
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --job-name=align_qc
#SBATCH --time=03:00:00 # HH/MM/SS
#SBATCH --mem=20G
cd /athena/angsd/scratch/cnm4001/NPC_alignmentQC
mkdir ReRun_Qorts
cd ReRun_Qorts
for index in 597 598 599 600 601 602; do
mkdir reRun_SRR11849${index}_alignmentQC
done
cd /athena/angsd/scratch/cnm4001/NPC_bams
mamba activate qorts
for index in 597 598 599 600 601 602; do
qorts -Xmx18000M QC \
--generatePlots \
--singleEnded \
SRR11849${index}.Aligned.sortedByCoord.out.bam \
/athena/angsd/scratch/cnm4001/hg38/hg38.gencodev43.gtf \
/athena/angsd/scratch/cnm4001/NPC_alignmentQC/reRun_SRR11849${index}_alignmentQC
done
Rerun Qorts Alignments.
It appears that the gtf was the issue since rerunning Qorts with new gtf has majority of reads uniquly aligning.
#! /bin/bash -l
#SBATCH --partition=angsd_class
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --job-name=align_qc
#SBATCH --time=00:30:00 # HH/MM/SS
#SBATCH --mem=8G
mamba activate angsd
featureCounts -a /athena/angsd/scratch/cnm4001/hg38/hg38.gencodev43.gtf \
-o /athena/angsd/scratch/cnm4001/NPC_counts/NPC_combined_featureCounts.txt \
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849597.Aligned.sortedByCoord.out.bam /athena/angsd/scratch/cnm4001/NPC_bams/SRR11849598.Aligned.sortedByCoord.out.bam
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849599.Aligned.sortedByCoord.out.bam
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849600.Aligned.sortedByCoord.out.bam
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849601.Aligned.sortedByCoord.out.bam
/athena/angsd/scratch/cnm4001/NPC_bams/SRR11849602.Aligned.sortedByCoord.out.bam
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
counts_matrix <- read.table("/Users/Caden/Desktop/angsd/project/angsd_project/FeatureCounts/new_counts/newGTF_NPC_combined_featureCounts.txt", header = TRUE, row.names = 1)
metadata<- read.csv("/Users/Caden/Downloads/SraRunTable-3.txt", header = TRUE)
counts_matrix <- counts_matrix[,-(1:5)]
new_sample_names <- c("Control_2", "Control_3", "Control_4", "GPE_2", "GPE_3", "GPE_4")
colnames(counts_matrix) <- new_sample_names
log_counts_matrix <-log2(counts_matrix)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:GenomicRanges':
##
## subtract
sample_info <- data.frame(condition = gsub("_[0-9]+", "", colnames(counts_matrix)),
row.names = names(counts_matrix) )
#DEseq requires intergers
counts_matrix <- round(counts_matrix, 0)
#Create DEseq object
DESeq.ds <- DESeqDataSetFromMatrix(countData = as.matrix(counts_matrix),
colData = sample_info,
design = ~ condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
#reads per sample
colSums(counts(DESeq.ds))
## Control_2 Control_3 Control_4 GPE_2 GPE_3 GPE_4
## 21347314 21655670 21627748 21783490 21977994 21565771
#remove genes with no reads
keep_genes <- rowSums(counts(DESeq.ds)) > 0
DESeq.ds <- DESeq.ds[ keep_genes, ]
dim(DESeq.ds)
## [1] 32213 6
#Size factor
DESeq.ds <- estimateSizeFactors(DESeq.ds) # calculate SFs, add them to object
plot( sizeFactors(DESeq.ds), colSums(counts(DESeq.ds)), # assess them
ylab = "library sizes", xlab = "size factors", cex = .6 )
par(mfrow=c(1,2))
## extracting normalized counts
counts.sf_normalized <- counts(DESeq.ds, normalized=TRUE)
## adding the boxplots
boxplot(counts(DESeq.ds), main = "read counts only", cex = .6)
boxplot(counts.sf_normalized, main = "SF normalized", cex = .6)
par(mfrow=c(1,2)) # to plot the two box plots next to each other
## bp of non-normalized
boxplot(log2(counts(DESeq.ds) +1), notch=TRUE,
main = "Non-normalized read counts",
ylab ="log2(read counts)", cex = .6)
## bp of size-factor normalized values
boxplot(log2(counts(DESeq.ds, normalized=TRUE) +1), notch=TRUE,
main = "Size-factor-normalized read counts",
ylab ="log2(read counts)", cex = .6)
## non-normalized read counts plus pseudocount
log.counts <- log2(counts(DESeq.ds, normalized = FALSE) + 1)
## instead of creating a new object, we could assign the values to a distinct matrix
## within the DESeq.ds object
assay(DESeq.ds, "log.counts") <- log2(counts(DESeq.ds, normalized = FALSE) + 1)
## normalized read counts
assay(DESeq.ds, "log.norm.counts") <- log2(counts(DESeq.ds, normalized=TRUE) + 1)
par(mfrow=c(1,2))
DESeq.ds[, c("Control_2","Control_3")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "Control_2 vs Control_3")
DESeq.ds[, c("GPE_2","GPE_3")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "GPE_2 vs GPE_3")
par(mfrow=c(1,3))
DESeq.ds[, c("Control_2","Control_3")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "Control_2 vs Control_3")
DESeq.ds[, c("Control_2","Control_4")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "Control_2 vs Control_4")
DESeq.ds[, c("Control_3","Control_4")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "Control_3 vs Control_4")
par(mfrow=c(1,3))
DESeq.ds[, c("GPE_2","Control_3")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "GPE_2 vs GPE_3")
DESeq.ds[, c("GPE_2","GPE_4")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "GPE_2 vs GPE_4")
DESeq.ds[, c("GPE_3","GPE_4")] %>%
assay(., "log.norm.counts") %>%
plot(., cex=.1, main = "GPE_3 vs GPE_4")
DESeq.rlog <- rlog(DESeq.ds, blind = FALSE) #since being treated with drug, I am trying blind
par(mfrow=c(1,2))
plot(assay(DESeq.ds, "log.norm.counts")[,1:2], cex=.1,
main = "size factor and log2-transformed")
## the rlog-transformed counts are stored in the "assay" accessor
plot(assay(DESeq.rlog)[,1:2],
cex=.1, main = "rlog transformed",
xlab = colnames(assay(DESeq.rlog[,1])),
ylab = colnames(assay(DESeq.rlog[,2])) )
library(pcaExplorer)
pcaExplorer(dds = DESeq.ds, dst = DESeq.rlog)
PCA.
heatmap.